package com.bruce.tool.address.spider.resolver;

import com.bruce.tool.common.exception.BaseRuntimeException;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility for decoding raw HTML bytes into text without mojibake, by honoring
 * the charset declared in the page's {@code Content-Type} meta tag.
 *
 * @author : Bruce(刘正航) 3:03 PM 2018/11/19
 */
@Slf4j
@NoArgsConstructor(access = AccessLevel.PRIVATE)
public class HtmlDecoder {
    /** Charset name used for both the gb2312 and gbk declarations. */
    private static final String GBK = "GBK";

    /** Charset label assumed when the page declares none of the recognised ones. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Matches {@code content="text/html; charset=XXX"} for the recognised charsets.
     * Group 1 is the charset label. Compiled once because it never changes; matching
     * is ASCII case-insensitive so e.g. {@code charset=Utf-8} is honored too.
     */
    private static final Pattern META_CHARSET = Pattern.compile(
            "content=\"text/html; charset=(iso-8859-1|iso8859-1|gb2312|gbk|utf-8)\"",
            Pattern.CASE_INSENSITIVE);

    /**
     * Decodes the bytes of a fetched web page using the charset declared in its
     * Content-Type meta tag.
     *
     * <p>Recognised declarations are iso-8859-1 / iso8859-1, gb2312, gbk and utf-8
     * (any letter case). gb2312 is decoded as GBK. When no recognised declaration
     * is found, the bytes are decoded as UTF-8.
     *
     * @param first the raw page bytes; may be {@code null} or empty
     * @return the decoded text, or an empty string for {@code null} / empty input
     * @throws BaseRuntimeException if the GBK charset is not supported by the runtime
     */
    public static String decodeHtml(byte[] first) {
        if (first == null || first.length == 0) {
            return "";
        }

        // ISO-8859-1 maps every byte to exactly one char, so the ASCII meta tag can
        // be searched for before the real encoding of the page is known.
        String probe = new String(first, StandardCharsets.ISO_8859_1);
        Matcher matcher = META_CHARSET.matcher(probe);
        // Only the first declaration found in the page is considered.
        String declared = matcher.find() ? matcher.group(1) : DEFAULT_CHARSET;

        try {
            if (declared.equalsIgnoreCase("gb2312") || declared.equalsIgnoreCase("gbk")) {
                return new String(first, GBK);
            }
            if (declared.equalsIgnoreCase("utf-8")) {
                return new String(first, StandardCharsets.UTF_8);
            }
            // Remaining labels allowed by the pattern: iso-8859-1 / iso8859-1.
            // The probe string was already decoded with exactly that charset.
            return probe;
        } catch (UnsupportedEncodingException e) {
            throw new BaseRuntimeException(e);
        }
    }

    /**
     * Collects regex groups from every match of {@code regex} in {@code content}
     * and appends them to {@code regexgroups}.
     *
     * <p>For each match, groups {@code 0 .. groupCount() - 1} are appended in order
     * (group 0 is the whole match).
     *
     * <p>NOTE(review): because the loop stops before {@code groupCount()}, the last
     * capturing group is never appended, and a regex without capturing groups
     * appends nothing. This looks like an off-by-one, but the behavior is kept
     * unchanged here since callers may rely on the current list layout — confirm
     * before changing.
     *
     * @param regex       the regular expression to search for
     * @param regexgroups the list that receives the collected groups (mutated in place)
     * @param content     the text to search
     */
    public static void filterByRegex(String regex, List<String> regexgroups, String content) {
        Matcher matcher = Pattern.compile(regex).matcher(content);
        while (matcher.find()) {
            int groupCount = matcher.groupCount();
            for (int i = 0; i < groupCount; i++) {
                regexgroups.add(matcher.group(i));
            }
        }
    }
}
